Set up

Functions

This are a few customized functions that will be used during this analysis.

#function to show color palettes
my_color_pal <- function(colours, names_or_hex, borders = NULL, cex_label = 1, ncol = NULL) {

  # get hexs
  colours_hex <- unname(colours)
  # get names
  names_colours <- names(colours)

  # functions internal to scales::show_col()
  n <- length(colours)
  ncol <- ceiling(sqrt(n))
  nrow <- ceiling(n/ncol)

  colours <- c(colours, rep(NA, nrow * ncol - length(colours)))
  colours <- matrix(colours, ncol = ncol, byrow = TRUE)

  old <- par(pty = "s", mar = c(0, 0, 0, 0))
  on.exit(par(old))

  size <- max(dim(colours))
  plot(c(0, size), c(0, -size), type = "n", xlab = "", ylab = "", axes = FALSE)
  rect(col(colours) - 1, -row(colours) + 1, col(colours), -row(colours),
    col = colours, border = borders)
  # add conditional plotting of hex codes or names
  if (names_or_hex == "hex") {
    text(col(colours) - 0.5, -row(colours) + 0.5, colours_hex)
  } else if(names_or_hex == "names"){
    text(col(colours) - 0.5, -row(colours) + 0.5, names_colours)
  } else {
    text(col(colours) - 0.5, -row(colours) + 0.5, paste0(names_colours,'\n',colours_hex))
  }
}
#function to rename counts.df to make some graphs
label_cond <- function(x) { samples.info$condition[x] }

# relabel_id <- function(df, col) {
#   if (exists("sample.info") && all(samples.info$Sample %in% df$col)) {
#     matching_row_names <- samples.info$Sample[samples.info$Sample %in% df$col]
#     df$col[df$col %in% matching_row_names] <- names(samples.info)[matching_row_names]
#   }
#   
#   return(df)
# }

Data import

Keep in mind that you need a folder called images and another one called stats. In this folder we will save all the results from our analysis. We define them to use them in all of our analysis.

setwd(file.path(wd))
imgdir  <- "./images/"
statdir <- "./stats/"

Load a sample info file with all the information from the ATAC seq.

library(readr)
samples.info <- read_delim("./info_embryo.csv",
                           delim = ";", 
                           escape_double = FALSE, 
                           trim_ws = TRUE)
head(samples.info,2)
## # A tibble: 2 × 4
##   stage Name               Replicate Sample  
##   <chr> <chr>                  <dbl> <chr>   
## 1 emb4  embryo_stage4_rep1         1 e4_1_001
## 2 emb4  embryo_stage4_rep2         2 e4_2_001

We make a matrix with the consensus peaks counts. We have the information speared in different files (one per stage) so we manege them in one matrix.

# Specify the folder path
counts_path <- "./counts"
counts.file <- paste0(counts_path, "/merged_embryos.counts")

counts <- as.matrix(read.csv(counts.file,row.names=1,sep = "\t"))
str(counts)
##  int [1:49253, 1:12] 240 505 304 680 52 126 37 23 209 9 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:49253] "peak1" "peak2" "peak3" "peak4" ...
##   ..$ : chr [1:12] "e10_1_001" "e10_2_001" "e12_1_001" "e12_2_001" ...
head(counts[,c(1, 2, 3)],2)
##       e10_1_001 e10_2_001 e12_1_001
## peak1       240       253       236
## peak2       505       390       316

We import some peak information (ampliar depsres la taula…)

peaks_width <- read_table("pfla_all_peaks_width.txt",
                          col_names = c("peak_id", "width"), 
                          col_types = cols(X2 = col_integer()))

peaks_annot <- read_delim("peaks_to_genes_annotation.tbl",
                          delim = "\t", 
                          escape_double = FALSE,
                          col_names = c("peak_id", "cdip_gene", "uniprot_id", "FBgn_id", "FB_symbol", "FB_name"), 
                          trim_ws = TRUE)

peaks_info <- merge(peaks_annot, 
                    peaks_width, 
                    by.x="peak_id")

Parameters set up

This group of data can be used to define some variables of your experiment and some aesthetic parameter for your graphs.

Set the statistical parameters for the analysis:

MIN.FC <- 0.5 # Define Fold Change (FC) threshold
MIN.PV <- 0.05 # Define p-value (PV) threshold
cat(paste0("The Fold change threshold this analysis is ", MIN.FC, 
             "\nThe máximum p-value considired sigificative in this analysis is ", MIN.PV))
## The Fold change threshold this analysis is 0.5
## The máximum p-value considired sigificative in this analysis is 0.05

We establish the comparisons we wish to study. We define our conditions to study as condition

condition_factors <- factor(samples.info$stage,
                            levels=c( "emb4","emb6","emb8","emb10", "emb12", "emb14" ))
samples.info$condition <- condition_factors
cat(paste0('Condition: ', levels(condition_factors), '\n'))
## Condition: emb4
##  Condition: emb6
##  Condition: emb8
##  Condition: emb10
##  Condition: emb12
##  Condition: emb14

Set some color parameters to have coherent graphs:

# IMPORTANT: The name of your conditions must be the same defined in samples.info
condition.colors <- c("emb4"="#F4AE3E",
                      "emb6"="#8EA4D2",
                      "emb8"="#4A314D",
                      "emb10"="#F59CA9",
                      "emb12"="#32936F",
                      "emb14"="#2B3A67") 
my_color_pal(colours = condition.colors,
             names_or_hex = "other", #names=condition hex=color_code other=both
             borders=NA) 

# Counts exploration

We explore the counts data without any transformation

Initial exploration of the raw data. In this section we will explore if there are some inconsistencies in our war data and it it’s necessary to eliminate some replicas.

We filter the rows with no counts or only 1 count. With this filter we eliminate some useless data and speed up the downstream processes.

keep <- rowSums(counts) > 2
counts <- counts[keep,]
#merge of the raw counts with the sample information. We also transform counts to a log2 scale for better visualization
dt.exp_raw <- stack(as.data.frame(counts))
dt.exp_raw$values <- log2(dt.exp_raw$values + 1e-3)
dt.exp_raw$gp <- label_cond(dt.exp_raw$ind)
colnames(dt.exp_raw) <- c('log2exp','sampleID','colorID')

We plot the variation of raw counts across our samples

boxplot.expr_raw <- ggplot(dt.exp_raw, aes(x=sampleID, y=log2exp, fill=colorID)) +
  geom_violin(show.legend = FALSE) +
  theme_light() +
  theme(plot.title = element_text(hjust=0.5),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values=condition.colors) +
  xlab("Samples") +
  ylab("log2 raw counts") +
  ggtitle("Boxplot of raw counts")
print(boxplot.expr_raw)

It’s also important to know the library size differences

libsize <- as.data.frame(colSums(counts))
colnames(libsize) <- "LibSize"
libsize$sampleID <- as.factor(row.names(libsize))
libsize$colorID <- label_cond(libsize$sampleID)


libsize_plot <- ggplot(libsize, aes(x=sampleID, y=LibSize, fill=colorID)) +
  geom_bar(stat="identity") +
  scale_fill_manual(values=condition.colors) +
  labs(title="Library size by sample") +
  ylab("Library Size") +
  xlab("Sample") +
  coord_flip() + theme_light() +
  theme(legend.position="none")
print(libsize_plot)

Establishing the assay conditions & Normalitzation

Our first normalization is by peak width. We will divide the counts by the amplitude of the peak

width_values <- peaks_info$width
names(width_values) <- peaks_info$peak_id
counts <- sweep(counts, 2, width_values, "/")

From our matrix we create an object called dge containing information from our counts and sample information.

dge <- DGEList(counts=counts,
               samples=samples.info,
               group=samples.info$condition)
str(dge)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
##   ..@ .Data:List of 2
##   .. ..$ : num [1:49253, 1:12] 0.787 2.087 2.171 1.071 0.109 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:49253] "peak1" "peak2" "peak3" "peak4" ...
##   .. .. .. ..$ : chr [1:12] "e10_1_001" "e10_2_001" "e12_1_001" "e12_2_001" ...
##   .. ..$ :'data.frame':  12 obs. of  8 variables:
##   .. .. ..$ group       : Factor w/ 6 levels "emb4","emb6",..: 1 1 2 2 3 3 4 4 5 5 ...
##   .. .. ..$ lib.size    : num [1:12] 17782 16938 14637 17050 18853 ...
##   .. .. ..$ norm.factors: num [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   .. .. ..$ stage       : chr [1:12] "emb4" "emb4" "emb6" "emb6" ...
##   .. .. ..$ Name        : chr [1:12] "embryo_stage4_rep1" "embryo_stage4_rep2" "embryo_stage6_rep1" "embryo_stage6_rep6" ...
##   .. .. ..$ Replicate   : num [1:12] 1 2 1 2 1 2 1 2 1 2 ...
##   .. .. ..$ Sample      : chr [1:12] "e4_1_001" "e4_2_001" "e6_1_001" "e6_2_001" ...
##   .. .. ..$ condition   : Factor w/ 6 levels "emb4","emb6",..: 1 1 2 2 3 3 4 4 5 5 ...
##   ..$ names: chr [1:2] "counts" "samples"

We most also define the design of our assay. In limma we define a matrix-like design that will indicate which are our hypothesis contrasts. This is one of the key steeps in the limma analysis.

design.condition   <- model.matrix( ~0+ samples.info$condition, dge )
colnames(design.condition) <- gsub("samples.info[$]condition", "", colnames(design.condition))
colnames(design.condition) <- gsub("samples.info[$]sample", "", colnames(design.condition))
names(attr(design.condition,"contrasts")) <- c("condition")
rownames(design.condition) <-samples.info$Sample
design.condition
##           emb4 emb6 emb8 emb10 emb12 emb14
## e4_1_001     1    0    0     0     0     0
## e4_2_001     1    0    0     0     0     0
## e6_1_001     0    1    0     0     0     0
## e6_2_001     0    1    0     0     0     0
## e8_1_001     0    0    1     0     0     0
## e8_2_001     0    0    1     0     0     0
## e10_1_001    0    0    0     1     0     0
## e10_2_001    0    0    0     1     0     0
## e12_1_001    0    0    0     0     1     0
## e12_2_001    0    0    0     0     1     0
## e14_1_001    0    0    0     0     0     1
## e14_2_001    0    0    0     0     0     1
## attr(,"assign")
## [1] 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"

If the sequencing depth is reasonably consistent across the samples, then the simplest and most robust approach is to use limma-trend. This approach will usually work well if the ratio of the largest library size to the smallest is not more than about 3-fold. We can check this in [Data exploration].

print(paste0('The largest library have: ',max(libsize$LibSize),' counts'))
## [1] "The largest library have: 7615361 counts"
print(paste0('The smallest library have: ',min(libsize$LibSize),' counts'))
## [1] "The smallest library have: 2703453 counts"
print(paste0("That's ", round(max(libsize$LibSize)/min(libsize$LibSize),1), " times bigger"))
## [1] "That's 2.8 times bigger"

In the limma-trend, the counts are normalized and converted to logCPM values.

dge <- calcNormFactors(dge, method="TMM")
logCPM <- cpm(dge, log=TRUE, prior.count=3)

After normalization we can observe the counts distribution varied and also how the normalized libraries are corrected.

#merge of the logCPM with the sample information. 
dt.exp <- stack(as.data.frame(logCPM))
dt.exp$gp <- label_cond(dt.exp$ind)
colnames(dt.exp) <- c('log2exp','sampleID','colorID')
boxplot.expr_raw <- ggplot(dt.exp, aes(x=sampleID, y=log2exp, fill=colorID)) +
  geom_violin(show.legend = FALSE) +
  theme_light() +
  theme(plot.title = element_text(hjust=0.5),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values=condition.colors) +
  xlab("Samples") +
  ylab("log2 CPM") +
  ggtitle("Boxplot of normalized counts")
print(boxplot.expr_raw)

It’s also important to know the library size differences

libsize <- as.data.frame(colSums(logCPM))
colnames(libsize) <- "LibSize"
libsize$sampleID <- as.factor(row.names(libsize))
libsize$colorID <- label_cond(libsize$sampleID)


libsize_plot <- ggplot(libsize, aes(x=sampleID, y=LibSize, fill=colorID)) +
  geom_bar(stat="identity") +
  scale_fill_manual(values=condition.colors) +
  labs(title="Library size by sample after normalitzation") +
  ylab("Library Size") +
  xlab("Sample") +
  coord_flip() + theme_light() +
  theme(legend.position="none")
print(libsize_plot)

We observe the PCA distribution of our samples

pcaData <- prcomp(counts, scale=T)
pcaData.df <- as.data.frame(pcaData$rotation)
percentVar <- round(summary(pcaData)$importance[2,]*100,2)

pcaData.df$sampleID <- as.factor(row.names(pcaData.df))
pcaData.df$colorID <- label_cond(pcaData.df$sampleID)


pca.expr_raw <- ggplot(pcaData.df, aes(x=PC1,y=PC2, color=colorID, label=sampleID)) +
  geom_point(size=3) +
  scale_color_manual(values=condition.colors) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  geom_label_repel(aes(label = sampleID),
                   box.padding   = 0.35,
                   point.padding = 0.5,
                   segment.color = 'grey50') +
  theme_light() +
  theme(legend.position="none")
print(pca.expr_raw)

We can also see the sample distribution in a dendrogram.

dist <- as.dist(1-cor(counts, method="spearman"))
distclust <- hclust(dist)
dendrogram <- as.dendrogram(distclust, hang=0.1)
labels_colors(dendrogram) <- condition.colors[samples.info$condition][order.dendrogram(dendrogram)]
dend_raw <- function(){plot(dendrogram,
                 horiz = FALSE,
                 yaxt='n', ann=FALSE)}
dend_raw()

limma analysis

Limma is a package for the analysis of gene expression data arising from microarray or RNA-seq technologies. A core capability is the use of linear models to assess differential expression in the context of multi-factor designed experiments. In the limma approach to RNA-seq, read counts are converted to log2-counts-per-million (logCPM) and the mean-variance relationship is modeled either with precision weights or with an empirical Bayes prior trend.

We fit our data to limma linear model

contrasts= c(paste("emb4","emb6", sep="-"),
             paste("emb6","emb8", sep="-"),
             paste("emb8","emb10", sep="-"),
             paste("emb10","emb12", sep="-"),
             paste("emb12","emb14", sep="-"))
contrasts.matrix <- makeContrasts(contrasts=contrasts, levels=design.condition)

fitt <- lmFit(logCPM, design.condition)
fitt <- contrasts.fit(fitt, contrasts.matrix)
fitt <- eBayes(fitt, trend=TRUE)
summary(decideTests(fitt))
##        emb4-emb6 emb6-emb8 emb8-emb10 emb10-emb12 emb12-emb14
## Down           0         0       1290           2           1
## NotSig     49252     49253      46918       49227       49250
## Up             1         0       1045          24           2

We plot how many new oppened peaks we have per stage. We define as new open the unregulated peaks and new closed the dowregulated peaks.

de_df <- data.frame(t(summary(decideTests(fitt))))
colnames(de_df) <- c("stg_trans", "exp", "n")

filtered_df <- de_df[de_df$exp != "NotSig", ]

ggplot(filtered_df, aes(x = stg_trans, y = n, color = exp, group = exp)) +
  geom_line() +
  geom_point() +
  labs(title = "Line Plot of n vs stg_trans",
       x = "Satge transition",
       y = "Number of differentally regulated peaks",
       color = "Chromatin state") +
  scale_color_manual(values = c("Down" = "#67B3E0", "Up" = "#FE938C"),
                     labels = c("Down" = "Closed", "Up" = "Opened")) +
  theme_minimal()

make_toptable_arraysonly <- function (thyset,de.efit,expr.raw,thycontrast,
                                      thylbl="",thyfun="",max.hmp.ROWS=100) {
  cat("# Running on ",thyset,".limma.",thylbl,thycontrast,"\n",sep="");
  top.res <- topTable(de.efit, coef=thycontrast, adjust="fdr", sort.by="B", number=Inf)
  top.res$contig.ID <- rownames(top.res)
  top.res <- left_join(top.res,annotations_table,by="contig.ID")
  top.res$rank <- ifelse(is.na(top.res$P.Value) | is.na(top.res$logFC), NA,
                         # ifelse(is.na(top.res$logFC), -1e160, -1e150 * top.res$logFC),
                         -log10(top.res$P.Value)*top.res$logFC *
                           ifelse(top.res$P.Value <= MIN.PV & abs(top.res$logFC) >= MIN.FC,
                                  ifelse(top.res$adj.P.Val <= MIN.PV, 1e4, 1e2), 1));
  top.res$DGE.pval <- as.factor(
    ifelse(top.res$P.Value <= MIN.PV & abs(top.res$logFC) >= MIN.FC,
           ifelse(top.res$logFC >= MIN.FC, "up", "down"), "no-sig") );
  top.res$DGE.padj <- as.factor(
    ifelse(top.res$adj.P.Val <= MIN.PV & abs(top.res$logFC) >= MIN.FC,
           ifelse(top.res$logFC >= MIN.FC, "UP", "DOWN"), "no-sig") );
  rownames(top.res) <- top.res$contig.ID
  contrast.Signif <- c( top.res$DGE.padj == "UP" | top.res$DGE.padj == "DOWN" )
  contrast.Signif.ids <- top.res$contig.ID[contrast.Signif]
  contrast.signif <- c( top.res$DGE.pval == "up" | top.res$DGE.pval == "down" )
  contrast.signif.ids <- top.res$contig.ID[contrast.signif]
  contrast.Tbl <- cbind(top.res[ contrast.Signif.ids, ],
                        expr.raw[ contrast.Signif.ids, ])
  contrast.tbl <- cbind(top.res[ contrast.signif.ids, ],
                        expr.raw[ contrast.signif.ids, ])
  cRows <- nrow(contrast.Tbl)
  crows <- nrow(contrast.tbl)             
  cmax.Rows <- min(cRows,max.hmp.ROWS) #100)
  cmax.rows <- min(crows,max.hmp.ROWS) #100)
  cat("# Heatmap for ",thyset,".limma.",thylbl,thycontrast,
      " nrows=",cRows,"/",cmax.Rows," : ",crows,"/",cmax.rows,"\n",sep="");
  contrast.Tmp <- (contrast.Tbl[with(contrast.Tbl,
                                     order(abs(contrast.Tbl$rank), # logFC),
                                           decreasing=TRUE )),]
  )[1:cmax.Rows,]
  contrast.tmp <- (contrast.tbl[with(contrast.tbl,
                                     order(abs(contrast.tbl$rank), # logFC),
                                           decreasing=TRUE )),]
  )[1:cmax.rows,]
  cond.select <- c (sub("_vs_.*", "", thycontrast), sub(".*_vs_", "", thycontrast))
  barcode.select <- samples.info[samples.info$condition %in% cond.select,]$barcode
  pHM1 <-pheatmap::pheatmap(contrast.Tmp[, barcode.select],
                            labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
                            labels_row = contrast.Tmp[, "contig.ID"],
                            angle_col=45,
                            main = paste0("Most varying ",cmax.Rows," of ",cRows," significant markers\nfor ",
                                          thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
  png(file=paste0(imgdir,"limma_heatmaps_",thyfun,"expression_significantgenesonly.",thyset,".limma.",thylbl,".",thycontrast,".png"),
      res=600, height=max(4,(20 * cmax.Rows/100)), width=10, unit="in", pointsize=10);
  grid::grid.newpage()
  grid::grid.draw(pHM1$gtable)
  dev.off();
  
  pHM2 <- pheatmap::pheatmap(contrast.Tmp[, barcode.select],
                             labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
                             labels_row = contrast.Tmp[, "Human.homolog"],
                             angle_col=45,
                             main = paste0("Most varying ",cmax.Rows," of ",cRows," significant markers\nfor ",
                                           thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
  png(file=paste0(imgdir,"limma_heatmaps_with_homologs_",thyfun,"expression_significantgenesonly.",thyset,".limma.",thylbl,".",thycontrast,".png"),
      res=600, height=max(4,(20 * cmax.Rows/100)), width=10, unit="in", pointsize=10);
  grid::grid.newpage()
  grid::grid.draw(pHM2$gtable)
  dev.off();
  
  phm1 <- pheatmap::pheatmap(contrast.tmp[, barcode.select],
                             labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
                             labels_row = contrast.tmp[, "contig.ID"],
                             angle_col=45,
                             main = paste0("Most varying ",cmax.rows," of ",cRows," markers\n for ",
                                           thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
  png(file=paste0(imgdir,"limma_heatmaps_",thyfun,"expression_topmostvaryinggenes.",thyset,".limma.",thylbl,".",thycontrast,".png"),
      res=600, height=max(4,(20 * cmax.rows/100)), width=10, unit="in", pointsize=10);
  grid::grid.newpage()
  grid::grid.draw(phm1$gtable)
  dev.off();
  
  phm2 <-pheatmap::pheatmap(contrast.tmp[, barcode.select],
                            labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
                            labels_row = contrast.tmp[, "Human.homolog"],
                            angle_col=45,
                            main = paste0("Most varying ",cmax.rows," of ",cRows," markers\n for ",
                                          thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
  png(file=paste0(imgdir,"limma_heatmaps_with_homologs_",thyfun,"expression_topmostvaryinggenes.",thyset,".limma.",thylbl,".",thycontrast,".png"),
      res=600, height=max(4,(20 * cmax.rows/100)), width=10, unit="in", pointsize=10);
  grid::grid.newpage()
  grid::grid.draw(phm2$gtable)
  dev.off();

}

We generate a table with the results:

make_top_table <- function(fit_model, contrast) {
  top.res.t <- topTable(fit_model, coef=contrast, 
                     adjust.method = "fdr", sort.by="B",
                     number=Inf)
  top.res.t$rank <- ifelse(is.na(top.res.t$P.Value) | is.na(top.res.t$logFC), NA,
                         -log10(top.res.t$P.Value)*top.res.t$logFC *
                           ifelse(top.res.t$P.Value <= MIN.PV,
                                  ifelse(top.res.t$adj.P.Val <= MIN.PV, 1e4, 1e2), 1));
  top.res.t$DGE.pval <- as.factor(
    ifelse(top.res.t$P.Value <= MIN.PV,
           ifelse(top.res.t$logFC >= MIN.FC, "up", "down"), "no-sig") );
  top.res.t$DGE.padj <- as.factor(
    ifelse(top.res.t$adj.P.Val <= MIN.PV,
           ifelse(top.res.t$logFC >= MIN.FC, "UP", "DOWN"), "no-sig") );
  
  UPDN <- c(nrow(top.res.t[ top.res.t$DGE.padj == "UP", ]),
            nrow(top.res.t[ top.res.t$DGE.padj == "DOWN", ]))
  updn <- c(nrow(top.res.t[ top.res.t$DGE.pval == "up", ]),
            nrow(top.res.t[ top.res.t$DGE.pval == "down", ]))
  
  top.res.t
}

top_table_list <- lapply(contrasts, 
                         function(x) make_top_table(fitt, x))
names(top_table_list) <- contrasts

We add the gene anotation information to the table

top_table_list_annot <- lapply(contrasts,
                               function(x) {
                                 df = tibble::rownames_to_column(top_table_list[[x]], "peak_id")
                                 merge(df, peaks_annot, by.x="peak_id")
                               })
names(top_table_list_annot) <- contrasts

We plot the results in a Volcano plot:

my_volcanoplot <- function(top.res.table, contrast_name) {
  #define some values for the caption
  UPDN <- c(nrow(top.res.table[ top.res.table$DGE.padj == "UP", ]),
            nrow(top.res.table[ top.res.table$DGE.padj == "DOWN", ]))
  
  EnhancedVolcano(data.frame(top.res.table), x = 'logFC', y = 'adj.P.Val',
                  lab = rownames(top.res.table),
                  xlab = bquote(~Log[2]~ 'fold change'),
                  ylab = bquote(~-Log[10]~adjusted~italic(P)),
                  xlim = c(min(top.res.table$logFC, na.rm = TRUE) - 0.5, max(top.res.table$logFC, na.rm = TRUE) + 0.5),
                  ylim = c(0, max(-log10(top.res.table$adj.P.Val), na.rm = TRUE) + 1),
                  pCutoff = MIN.PV, FCcutoff = MIN.FC, pointSize = 1.0, labSize = 2.0,
                  title = "Volcano Plot",
                  subtitle = contrast_name,
                  caption = paste0('log2 FC cutoff: ', MIN.FC, '; p-value cutoff: ',
                                   MIN.PV, '\nTotal = ', nrow(top.res.table),
                                   ' markers  [ ',UPDN[1],'UP, ',UPDN[2],'DOWN ]'),
                  legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0,
                  drawConnectors = TRUE, widthConnectors = 0.25,
                  colConnectors = 'grey30')}
volcano_plot_list <- lapply(1:length(top_table_list),
       function(i) {
         my_volcanoplot(top.res.table=top_table_list[[i]],
                        contrast_name=names(top_table_list)[[i]])
       })

print(volcano_plot_list)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

MFFuzz

library("Mfuzz")
library("Biobase")
library("stringr")

We are going to use the normalized counts in CPMs saved as logCPM (see Normalitzation secction). We collapse the biological replicates to do this experiment because we normalized them and we have seen that they are not too different.

logCPM_collapsed <- matrix(0, nrow = nrow(logCPM), ncol = length(levels(condition_factors)))
colnames(logCPM_collapsed) <- levels(condition_factors)
rownames(logCPM_collapsed) <- rownames(logCPM)
for (i in 1:length(levels(condition_factors))) {
  names_before_underscore <- str_extract(colnames(logCPM), "^[^_]+")
  unique_names <- unique(names_before_underscore)
  cols_to_sum <- startsWith(names_before_underscore, unique_names[i])
  logCPM_collapsed[, i] <- rowSums(logCPM[, cols_to_sum])
}

We save the data in a new object using ExpressionSet

normalizedSet <- ExpressionSet(assayData=logCPM_collapsed)

We filter any possible NA values. We also standardize the expression values of every peak so that the average expression value for each peak is zero and the standard deviation of its expression profile is one

embryo.r <- filter.NA(normalizedSet, thres=0.5)
## 0 genes excluded.
embryo.f <- fill.NA(embryo.r,mode="knnw")
embryo.s <- standardise(embryo.f)

We optimize the parameters for the MFuzz analysis. we will use two parameters: - fuzzifier value (m): We will determine the optimal setting of fuzzifie value using mestimate. This will be only an estimation since the final result tuned depending on the performance of the function. - number of clusters: final number of clusters. This will be determined using repeated soft clustering for a range of cluster numbers. This will report the minimum centroid distance. The minimum centroid distance can be used as cluster validity index. For an optimal cluster number, we may see a ‘drop’ of minimum centroid distance wh plotted versus a range of cluster number and a slower decrease of the minimum centroid distance for higher cluster number.

m_est <- mestimate(embryo.s)
m_est
## [1] 1.70376
m=1.5

plot_opt_cluster  <- Dmin(embryo.f,m=m,crange=seq(5,40,5),repeats=30,visu=TRUE)

print(plot_opt_cluster)
## [1] 0.36083955 0.19994997 0.14980903 0.13156243 0.11949226 0.11002667 0.10342429
## [8] 0.09765822

We define the final parameters for the Mfuzz analysis

m=1.5
cluster=35
mf <- mfuzz(embryo.s,c=cluster,m=m)
mfuzz.plot(embryo.s,cl=mf, time.labels=c("emb4","emb6","emb8","emb10","emb12","emb14"),new.window=FALSE)

The results are stored in the mf object. Some interesting metrics are: - centers: the final cluster centers. - size: the number of data points in each cluster of the closest hard clustering. - cluster: a vector of integers containing the indices of the clusters where the data points are assigned to for the closest hard clustering, as obtained by assigning points to the (first) class with maximal membership. - membership: a matrix with the membership values of the data points to the clusters.

We will make a data frame with the cluster assigned to each peak and add the annotation information. This will be useful for downstream analysis like GO or pathway enrichment.

mf_clusters <- data.frame(peak=names(mf$cluster), clusters = as.numeric(mf$cluster))
colnames(mf_clusters) <- c("peak_id", "cluster")
mf_clusters_annot <- merge(mf_clusters, peaks_annot, by.x="peak_id")